# Script setup: show the start time, clear the workspace, move to the project
# directory and load the shared project code.
Sys.time()
rm(list = ls())
setwd([PATH])
# The sourced file is presumably where the helpers and path variables used
# below (e.g. NA_to_F, secure_derived) come from -- they are not defined in
# this script.
source("0_Initiation_Programs.R",encoding = "UTF-8") ; gc()
# Start a socket cluster and register it with doSNOW.
# NOTE(review): no stopCluster(cl) appears in the visible part of this script.
cl<-makeCluster([CORES],type="SOCK", outfile="")
registerDoSNOW(cl)

# Switch: when TRUE, the course-level dataset is rebuilt near the end of the script.
Clean_Course_Data<-T





# Load the processed applicant extract. The .Rda is expected to contain an
# object named `Data`, which is renamed to `A` for the rest of the script.
load(paste0(secure_derived,"uadm_data_extract_processed.Rda"))
A<-Data ; rm(Data) ; gc()


###
#  NSC degree attainment indicators
###
# NSC-only degree attainment flags. A flag is 1 when NSC reports a four-year
# degree (NSC_Grad_FourYear == 1) with a degree year no later than YEARAPAY
# plus the window; comparisons that evaluate to NA are passed through NA_to_F
# (assumed to turn NA into FALSE) and therefore end up as 0.
nsc_has_degree <- A$NSC_Grad_FourYear == 1
nsc_year_windows <- c(
  UnivGrad_NSCOnly = 5, # Five-Year grad rate
  UnivGrad_TwoYears_NSCOnly = 2,
  UnivGrad_ThreeYears_NSCOnly = 3,
  UnivGrad_FourYears_NSCOnly = 4,
  UnivGrad_SixYears_NSCOnly = 6
)
for (nsc_flag in names(nsc_year_windows)) {
  nsc_in_window <- A$NSC_Grad_Year <= A$YEARAPAY + nsc_year_windows[[nsc_flag]]
  A[[nsc_flag]] <- as.integer(NA_to_F(nsc_has_degree & nsc_in_window))
}
A$UnivGrad_Ever_NSCOnly <- as.integer(NA_to_F(nsc_has_degree))

# Years between application year and NSC degree year; the WithinSix copy is
# filled only where that gap is below 7 and is left missing elsewhere.
A$Num_Years_NSCOnly <- A$NSC_Grad_Year - A$YEARAPAY
nsc_within_six <- NA_to_F(A$Num_Years_NSCOnly < 7)
A$Num_Years_NSCOnly_WithinSix[nsc_within_six] <- A$Num_Years_NSCOnly[nsc_within_six]
rm(nsc_has_degree, nsc_year_windows, nsc_flag, nsc_in_window, nsc_within_six)

###
#  Classify majors into disciplines and STEM
###
# Major-code lookup table. The .Rda loaded next is expected to provide the
# vector `STEM` of major codes (it is not defined in this script).
majors <- read_csv("Data/Derived/NSC_Major_Codes_Categorized.csv")
load("Data/Derived/NSC_STEM_fromCIP.Rda")
majors$STEM <- majors$Major %in% STEM
majors <- rename(majors, NSC_STEM = STEM)

# Discipline groups: list names are the indicator columns to create, values
# are the MajorSimp labels that belong to each group.
discipline_labels <- list(
  NSC_Art = c("Art","Creative Writing","Music","Theater"),
  NSC_Hum = c("Art History","Chinese","Classics","English","Film","French","German","History","Japanese","Other Humanities","Other Languages","Philosophy","Religion","Russian","Spanish"),
  NSC_SS = c("Anthropology","Ethnic Studies","Geography","International Studies","Law","Linguistics","Other Social Sciences","Political Science","Psychology","Sociology","Urban Planning","Women's Studies"),
  NSC_Sci = c("Biochemistry","Biology","Biophysics","Chemistry","Cognitive Science","Environmental Studies","Geology","Mathematics","Neuroscience","Other Natural Sciences","Physics","Statistics"),
  NSC_Eng = c("Bioengineering","Chemical Engineering","Civil Engineering","Computer Engineering","Computer Science","Computer Science Engineering","Electrical Engineering","Industrial Engineering","Information","Materials Science","Mechanical Engineering","Other Engineering"),
  NSC_Prof = c("Agriculture","Architecture","Communications","Criminology","Design","Education","Journalism","Kinesiology","Nursing","Nutrition","Other Health Sciences","Other Professional","Public Health","Public Policy","Social Welfare","Speech Pathology","Veterinary Medicine"),
  NSC_Bus = c("Accounting","Business","Economics","Finance","Marketing","Math Economics")
)
for (discipline_col in names(discipline_labels)) {
  discipline_flag <- majors$MajorSimp %in% discipline_labels[[discipline_col]]
  # %in% never yields NA, so this mask selects no rows; the assignment is kept
  # so the resulting column is exactly what the original statements produced.
  discipline_flag[is.na(discipline_flag) & !is.na(majors$MajorSimp)] <- 0
  majors[[discipline_col]] <- discipline_flag
}

# Keep only the first row for each Major value.
majors <- majors[!duplicated(majors$Major), ]
rm(discipline_labels, discipline_col, discipline_flag)

# Split the ";;"-separated NSC_Grad_Major string into individual majors, strip
# degree-related text from each one with a single gsub() (a trailing 2-3 letter
# suffix, "BACHELOR OF ...", a leading "MAJOR:", leading/trailing two-letter
# abbreviations), and store the first three as NSC_Major1..NSC_Major3. Rows
# whose major string is NA get three empty strings. `t` temporarily shadows
# base::t() and is removed at the end of the line.
# NOTE(review): the class [A-z] also matches the ASCII characters between "Z"
# and "a" ([ \ ] ^ _ `); [A-Za-z] may have been intended -- confirm before
# changing, since it could alter which majors match the lookup.
t<-A$NSC_Grad_Major ; t<-strsplit(t,";;") ; t<-lapply(t,function(x) gsub("[ -][(]?\\s*[A-z]{2,3}\\s*[)]?\\s*$|BACHELOR\\s*OF\\s*(([A-z]*)\\s*IN\\s*|ARTS[ -]+|SCIENCE[ -]+)?|^MAJOR:\\s*|^[BA][AS]\\s+|^[A-Z][. ]*[A-Z][-:. ]+|[-:. ]+[A-Z][. ]*[A-Z][-:. ]*$","",x)) ; t<-lapply(t,function(x) if(is.na(x)[1]) c("","","") else x) ; A<-cbind(A,data.frame(NSC_Major1=unlist(lapply(t,function(x) x[1])),NSC_Major2=unlist(lapply(t,function(x) x[2])),NSC_Major3=unlist(lapply(t,function(x) x[3])))) ; rm(t) #Ignore fourth major if present
# Simplified major of the first slot that matches the lookup (filled in the loop).
A$NSC_Grad_MajorSimp<-NA
# For each of the three major slots, look the major up in `majors` and fold the
# result into the applicant-level NSC_Grad_* indicators.
for(i in 1:3){
  # Give the lookup a key column named like A's slot column so merge() joins on
  # it, and make sure A's slot column is character rather than factor.
  majors[,paste0("NSC_Major",i)]<-majors$Major ; A[,paste0("NSC_Major",i)]<-as.character(A[,paste0("NSC_Major",i)])
  # Left join on the shared column name(s) -- expected to be only NSC_Major<i>
  # (TODO confirm A has no other column in common with this subset). The regex
  # keeps the NSC_* flags, MajorSimp and this slot's key, but not earlier keys.
  A<-merge(A,majors[,grepl(paste0("NSC_[^M]|MajorSimp|Major",i),names(majors))],all.x=T)
  # Keep the slot-specific simplified major.
  A[,paste0("NSC_MajorSimp",i)]<-A$MajorSimp
  # Fill the overall simplified major from this slot only where it is still
  # missing, then drop the merged-in MajorSimp so the next pass can merge again.
  A$NSC_Grad_MajorSimp[is.na(A$NSC_Grad_MajorSimp)]<-A$MajorSimp[is.na(A$NSC_Grad_MajorSimp)] ; A<-A[,names(A)!="MajorSimp"]
  for(v in c("STEM","Art","Hum","SS","Sci","Eng","Prof","Bus")){
    # Set NSC_Grad_<v> to 1 when this slot's major carries the flag; rows never
    # flagged in any slot stay NA at this point.
    A[NA_to_F(A[,paste0("NSC_",v)]==1),paste0("NSC_Grad_",v)]<-1
    # Remove the per-slot flag so it does not collide with the next merge.
    A<-A[,names(A)!=paste0("NSC_",v)]
  }
}
# Conditional degree major indicators.
# Step 1: where a simplified major is known, turn the still-missing field
# indicators into explicit zeros (rows without a known major stay NA).
for(v in c("STEM","Art","Hum","SS","Sci","Eng","Prof","Bus")){
  A[is.na(A[,paste0("NSC_Grad_",v)])&!is.na(A$NSC_Grad_MajorSimp),paste0("NSC_Grad_",v)]<-0
}
# Step 2: rows whose major falls into none of the seven disciplines get all
# eight indicators set to NA.
# The row mask must be computed once, before any indicator is overwritten:
# recomputing it inside the loop makes the sum NA as soon as NSC_Grad_Art has
# been blanked, so only that first column would ever be set to NA.
no_discipline_rows<-NA_to_F(A$NSC_Grad_Art+A$NSC_Grad_Hum+A$NSC_Grad_SS+A$NSC_Grad_Sci+A$NSC_Grad_Eng+A$NSC_Grad_Prof+A$NSC_Grad_Bus==0)
for(v in c("Art","Hum","SS","Sci","Eng","Prof","Bus","STEM")){
  A[no_discipline_rows,paste0("NSC_Grad_",v)]<-NA
}
rm(no_discipline_rows)

# Three outcome variables per field, all based on the five-year NSC graduation
# flag. The row masks do not depend on the field, so they are built once.
grad_in_five <- NA_to_F(A$UnivGrad_NSCOnly == 1)
grad_in_five_undeclared <- grad_in_five & NA_to_F(A$Pros_Major_Sum == 0)
for (field in c("STEM", "Art", "Hum", "SS", "Sci", "Eng", "Prof", "Bus")) {
  field_degree <- A[, paste0("NSC_Grad_", field)]
  # Earn degree in field, conditional on graduating
  A[grad_in_five, paste0(field, "_NSCOnly")] <- field_degree[grad_in_five]
  # Earn degree in field, conditional on graduating AND applying to UC
  # undeclared (at all campuses)
  A[grad_in_five_undeclared, paste0(field, "_Undec_NSCOnly")] <- field_degree[grad_in_five_undeclared]
  # Earn degree in field within 5 years, unconditional on graduating
  A[, paste0(field, "_Uncondit_NSCOnly")] <- NA_to_F(field_degree == 1) & grad_in_five
}
rm(grad_in_five, grad_in_five_undeclared, field, field_degree)


###
#  Merge in data from IPEDS
###
#Merge in institutional graduation rate, merged on OPEID
# Build an OPEID-by-cohort-year table of institutional graduation rates from
# the IPEDS extract.
grad<-read.csv("Data/Raw/IPEDS_GradRate_AA.csv")
  # Shorten the IPEDS column names, keeping the digits that follow "GR" as a suffix:
  #   Grand.total..GR<digits> ... Completers ... 150 ...  -> NG6_<digits>
  #   Grand.total..GR<digits> ... Completers ... 4 ...    -> NG4_<digits>
  #   Grand.total..GR<digits> ... revised ...             -> BG_<digits>
  # (presumably completers within 150% of normal time, completers within four
  # years, and the revised cohort size -- confirm against the IPEDS file.)
  # The 150 pattern runs first; once renamed, those columns no longer match the
  # broader "4" pattern.
  names(grad)<-gsub("Grand.total..GR([0-9]+).*Completers.*150.*$","NG6_\\1",names(grad))
  names(grad)<-gsub("Grand.total..GR([0-9]+).*Completers.*4.*$","NG4_\\1",names(grad))
  names(grad)<-gsub("Grand.total..GR([0-9]+).*revised.*$","BG_\\1",names(grad))
  # Insert a "-" before the last two characters of the OPEID.
  grad$OPEID<-gsub("(.{2})$","-\\1",grad$Office.of.Postsecondary.Education.ID.Number..FA2000HD.)
  # Missing values become 0 in every column, so they add nothing to the sums below.
  for(v in names(grad)) grad[is.na(grad[,v]),v]<-0
  # Sum the count columns over all rows sharing an OPEID, dropping empty OPEIDs.
  grad<-aggregate(.~OPEID,grad[!NA_to_T(grad$OPEID==""),grepl("^[NB]G|OPEID",names(grad))],sum,na.rm=T)
  # Rate = completers / cohort for the four- and six-year measures, 1997-2002.
  for(i in c(4,6)) for(y in 1997:2002) grad[,paste0("G",i,".",y)]<-grad[,paste0("NG",i,"_",y)]/grad[,paste0("BG_",y)]
  # Keep only the rates and OPEID, sorted by name so that columns 1:6 are
  # G4.1997..G4.2002 and 7:12 are G6.1997..G6.2002 -- reshape() below relies on
  # these positions.
  grad<-grad[,grepl("^G[0-9]|OPEID",names(grad))] ; grad<-grad[,sort(names(grad))]
  # Wide to long: one row per OPEID and time index (1..6).
  grad<-reshape(grad,v.names=c("G4","G6"),sep=".",direction="long",idvar="OPEID",varying=list(1:6,7:12))
  # Drop rows where both rates are missing (this includes 0/0 results).
  grad<-grad[!(is.na(grad$G4)&is.na(grad$G6)),]
  grad<-rename(grad,YEARAPAY=time,NSC_Enr_IPEDS_GradRateFour=G4,NSC_Enr_IPEDS_GradRateSix=G6)
  # Convert the time index 1..6 into the years 1997..2002.
  grad$YEARAPAY<-grad$YEARAPAY+1996
  temp<-grad[grad$YEARAPAY==1997,] ; temp$YEARAPAY<-1996 ; grad<-rbind(grad,temp) #Create 1996, replicating 1997 (since otherwise unavailable)
  temp<-grad[grad$YEARAPAY==1997,] ; temp$YEARAPAY<-1995 ; grad<-rbind(grad,temp) #Create 1995, replicating 1997 (since otherwise unavailable)
# Attach the IPEDS four- and six-year graduation rates three times: for the
# enrollment institution, the degree institution and the first-enrollment
# institution. Before each pair of merges the rate columns in `grad` are renamed
# to carry the matching prefix (the first rename is a no-op). Each merge is a
# left join on that institution's college code and the application year.
grad_prefix_before <- "NSC_Enr"
for (grad_prefix in c("NSC_Enr", "NSC_Grad", "NSC_EnrFirst")) {
  names(grad) <- gsub(grad_prefix_before, grad_prefix, names(grad))
  for (grad_rate_suffix in c("_IPEDS_GradRateFour", "_IPEDS_GradRateSix")) {
    A <- merge(
      A,
      grad[, c("OPEID", "YEARAPAY", paste0(grad_prefix, grad_rate_suffix))],
      by.x = c(paste0(grad_prefix, "_College_Code"), "YEARAPAY"),
      by.y = c("OPEID", "YEARAPAY"),
      all.x = TRUE
    )
  }
  grad_prefix_before <- grad_prefix
}
rm(grad_prefix_before, grad_prefix, grad_rate_suffix)

# IPEDS institutional characteristics, keyed by OPEID with a "-" inserted
# before its last two characters; placeholder and empty IDs are dropped.
ipeds <- read.csv("Data/Raw/IPEDS_Data.csv", stringsAsFactors = FALSE)
ipeds$OPEID <- gsub("(.{2})$", "-\\1", ipeds$Office.of.Postsecondary.Education.ID.Number..FA2001HD.)
ipeds <- ipeds[!ipeds$OPEID %in% c("-2    -  ", ""), ]

# Admission rate in percent (admissions / applicants); values above 100 are
# treated as invalid and set to NA.
select <- ipeds[, c("OPEID", "Admissions.total..IC2001.", "Applicants.total..IC2001.")]
adm_rate_pct <- select$Admissions.total..IC2001. / select$Applicants.total..IC2001. * 100
adm_rate_pct[NA_to_F(adm_rate_pct > 100)] <- NA
select$NSC_Enr_IPEDS_AdmRate <- adm_rate_pct

# Left-join the rate onto A once per institution type, renaming the rate column
# to the matching prefix before each merge (the first rename is a no-op).
adm_prefix_before <- "NSC_Enr"
for (adm_prefix in c("NSC_Enr", "NSC_Grad", "NSC_EnrFirst")) {
  names(select) <- gsub(adm_prefix_before, adm_prefix, names(select))
  A <- merge(
    A,
    select[, c("OPEID", paste0(adm_prefix, "_IPEDS_AdmRate"))],
    by.x = paste0(adm_prefix, "_College_Code"),
    by.y = "OPEID",
    all.x = TRUE
  )
  adm_prefix_before <- adm_prefix
}
rm(adm_rate_pct, adm_prefix_before, adm_prefix)

# Institution-level SAT measure on the 1600 scale: the midpoint of the 25th and
# 75th percentile verbal scores plus the midpoint of the math scores.
# NOTE(review): the first verbal column is spelled "SAT.I." while the other
# three use "SAT.1." -- names are reproduced as written; confirm they match the
# headers of the IPEDS file.
sat <- ipeds[, grepl("OPEID|SAT", names(ipeds))]
sat_verbal_mid <- (sat$SAT.I.Verbal.25th.percentile.score..IC2001. + sat$SAT.1.Verbal.75th.percentile.score..IC2001.) / 2
sat_math_mid <- (sat$SAT.1.Math.25th.percentile.score..IC2001. + sat$SAT.1.Math.75th.percentile.score..IC2001.) / 2
sat$NSC_Enr_IPEDS_SAT1600 <- sat_verbal_mid + sat_math_mid

# Left-join onto A once per institution type, renaming the SAT column to the
# matching prefix before each merge (the first rename is a no-op).
sat_prefix_before <- "NSC_Enr"
for (sat_prefix in c("NSC_Enr", "NSC_Grad", "NSC_EnrFirst")) {
  names(sat) <- gsub(sat_prefix_before, sat_prefix, names(sat))
  A <- merge(
    A,
    sat[, grepl(paste0("OPEID|", sat_prefix), names(sat))],
    by.x = paste0(sat_prefix, "_College_Code"),
    by.y = "OPEID",
    all.x = TRUE
  )
  sat_prefix_before <- sat_prefix
}
rm(sat_verbal_mid, sat_math_mid, sat_prefix_before, sat_prefix)


###
#  Build enrollment indicators using NSC data
###
# Missing ENR10 values are treated as "not enrolled".
A$ENR10[is.na(A$ENR10)] <- 0

# Name patterns for the NSC enrollment institution, listed in the same order as
# the campus codes below (pattern k belongs to code k).
ucs <- c("UNIVERSITY OF CALIFORNIA-BERKELEY|UNIVERSITY OF CALIFORNIA - BERKELEY","UNIVERSITY OF CALIFORNIA-DAVIS","UNIVERSITY OF CALIFORNIA-LOS ANGELES","UNIVERSITY OF CALIFORNIA - RIVERSIDE","UNIVERSITY OF CALIFORNIA-SAN DIEGO","UNIVERSITY OF CALIFORNIA-SANTA CRUZ","UNIVERSITY OF CALIFORNIA-SANTA BARBARA","UNIVERSITY OF CALIFORNIA-IRVINE|UNIVERSITY OF CALIFORNIA - IRVINE","UNIVERSITY OF CALIFORNIA-MERCED|UNIVERSITY OF CALIFORNIA - MERCED")
uc_campus_codes <- c("01", "03", "04", "05", "06", "07", "08", "09", "10")
for (campus_pos in seq_along(uc_campus_codes)) {
  campus_enr_col <- paste0("ENR", uc_campus_codes[campus_pos])
  # Keep the original enrollment flag, then also mark enrollment whenever the
  # NSC college name matches this campus.
  A[, paste0(campus_enr_col, "_Orig")] <- A[, campus_enr_col]
  A[NA_to_F(grepl(ucs[campus_pos], A$NSC_Enr_College_Name)), campus_enr_col] <- 1
}

# Admitted to at least one of the nine campuses.
campus_admits <- lapply(
  paste0("ADM", uc_campus_codes),
  function(adm_col) NA_to_F(A[[adm_col]] == 1)
)
A$UCAdm <- Reduce(`|`, campus_admits)
rm(uc_campus_codes, campus_pos, campus_enr_col, campus_admits)

#Partition enrollment institutions into categories
# UC enrollment: any positive value across the columns whose names match
# "ENR[01]" (this pattern also picks up the *_Orig copies).
enr_flag_cols <- grepl("ENR[01]", names(A))
A$UCEnr <- as.integer(rowSums(A[, enr_flag_cols], na.rm = TRUE) > 0)
# Community college: NSC institution is not four-year and there is no UC enrollment.
A$CCEnr <- NA_to_F(!A$NSC_Enr_FourYear) & A$UCEnr == 0
# CSU: NSC institution name matches a CSU campus pattern and there is no UC enrollment.
csu_name_match <- grepl("CALIFORNIA STATE UNIV|CALIFORNIA POLYTECHNIC STATE|CALIFORNIA STATE POLY|SAN DIEGO STATE|SAN FRANCISCO STATE UN|SAN JOSE STATE UNIV|HUMBOLDT STATE UNI|SONOMA STATE UNIV|CALIFORNIA MARITIME ACADEMY", A$NSC_Enr_College_Name)
A$CSUEnr <- NA_to_F(csu_name_match) & A$UCEnr == 0
rm(enr_flag_cols, csu_name_match)

# Normalize every *College_Code column: drop the last three characters and
# round-trip through integer, which also removes leading zeros. This runs after
# the IPEDS merges above, which joined on the unmodified codes.
for(v in names(A)[grepl("College_Code$",names(A))]) A[,v]<-as.character(as.integer(gsub(".{3}$","",A[,v])))
# Classify the first institution attended (including community colleges) into
# mutually exclusive groups. The three UC groups are defined by lists of college
# codes (which campus each code denotes is not shown here -- confirm); extension
# enrollment is excluded from the first group only.
A$HighUCEnrFirstIncCC<-NA_to_F(A$NSC_EnrFirstIncCC_College_Code%in%c(1312,1315,1317))&NA_to_F(!grepl("UNIVERSITY OF CALIFORNIA-EXTENSION",A$NSC_EnrFirstIncCC_College_Name))
A$MidUCEnrFirstIncCC<-NA_to_F(A$NSC_EnrFirstIncCC_College_Code%in%c(1313,1314,1320))
A$LowUCEnrFirstIncCC<-NA_to_F(A$NSC_EnrFirstIncCC_College_Code%in%c(1316,1321)) #These all get messed up somehow...
# Community college: the code appears among institutions flagged CCEnr above, or
# a California institution whose name matches the listed keywords.
A$CCEnrFirstIncCC<-NA_to_F(A$NSC_EnrFirstIncCC_College_Code%in%A$NSC_Enr_College_Code[A$CCEnr==1])|(NA_to_F(A$NSC_EnrFirstIncCC_College_State=="CA"&grepl("COMMUNITY|CITY COLL|EVERGREEN VALLEY",A$NSC_EnrFirstIncCC_College_Name)))
# CSU: the code appears among institutions flagged CSUEnr above, or one of three
# explicitly listed campus names.
A$CSUEnrFirstIncCC<-NA_to_F(A$NSC_EnrFirstIncCC_College_Code%in%A$NSC_Enr_College_Code[A$CSUEnr==1])|A$NSC_EnrFirstIncCC_College_Name%in%c("CALIFORNIA STATE UNIVERSITY - LOS ANGELES","CALIFORNIA STATE UNIVERSITY - BAKERSFIELD","CALIFORNIA STATE UNIVERSITY MARITIME ACADEMY")
# "Ivy" group: the eight Ivy League schools plus Chicago, MIT and Stanford.
A$IvyEnrFirstIncCC<-A$NSC_EnrFirstIncCC_College_Name%in%c("UNIVERSITY OF PENNSYLVANIA","YALE UNIVERSITY","CORNELL UNIVERSITY","COLUMBIA UNIVERSITY","HARVARD UNIVERSITY","PRINCETON UNIVERSITY","BROWN UNIVERSITY","DARTMOUTH COLLEGE","UNIVERSITY OF CHICAGO","MASSACHUSETTS INSTITUTE OF TECHNOLOGY","STANFORD UNIVERSITY")
# Outside California: state is neither 6 nor "CA", a name is present, the name
# does not contain LAW / HEALTH SCIENCES / EXTENSION as a word, and not "Ivy".
A$NonCAEnrFirstIncCC<-NA_to_T(!A$NSC_EnrFirstIncCC_College_State%in%c(6,"CA"))&!is.na(A$NSC_EnrFirstIncCC_College_Name)&!grepl("(^|\\s)(LAW|HEALTH\\sSCIENCES|EXTENSION)($|\\s)",A$NSC_EnrFirstIncCC_College_Name)&!A$IvyEnrFirstIncCC
# Remaining California institutions: in state but in none of the UC / CC / CSU
# groups, with the same name exclusions and not "Ivy".
A$CAPrivEnrFirstIncCC<-(NA_to_F(A$NSC_EnrFirstIncCC_College_State%in%c(6,"CA")&!(A$HighUCEnrFirstIncCC|A$MidUCEnrFirstIncCC|A$LowUCEnrFirstIncCC|A$CCEnrFirstIncCC|A$CSUEnrFirstIncCC)))&!grepl("(^|\\s)(LAW|HEALTH\\sSCIENCES|EXTENSION)($|\\s)",A$NSC_EnrFirstIncCC_College_Name)&!A$IvyEnrFirstIncCC
# Residual group: none of the above.
A$NonEnrFirstIncCC<-!(A$HighUCEnrFirstIncCC==1|A$MidUCEnrFirstIncCC==1|A$LowUCEnrFirstIncCC==1|A$CCEnrFirstIncCC==1|A$CSUEnrFirstIncCC==1|A$CAPrivEnrFirstIncCC==1|A$NonCAEnrFirstIncCC==1|A$IvyEnrFirstIncCC==1)


###
#  General data cleaning
###

# Demographic flags.
A$Female <- A$SEX == "F"
A$URM <- A$CETHNICA %in% c("A", "B", "C", "J")

# Parental income: winsorized copy, a copy that keeps zeros, log income with
# undefined logs (NA or infinite, e.g. log(0)) set to 0, and finally zeros
# recoded to missing together with a missing-income flag.
A$income_parent <- winsor(A$income_parent, 0.01) #Winsorize at 1%
A$income_parent_zeros <- A$income_parent
log_parent_income <- log(A$income_parent)
log_parent_income[is.na(log_parent_income) | is.infinite(log_parent_income)] <- 0
A$logincome_parent_zeros <- log_parent_income
A$income_parent[A$income_parent == 0] <- NA
A$income_Missing <- is.na(A$income_parent)

# SAT II math: start from Math 2, fall back to Math 1 when Math 2 is missing,
# and use Math 1 whenever it is the higher of the two.
sat2_math_best <- A$SATII_M2
sat2_m2_absent <- is.na(sat2_math_best)
sat2_math_best[sat2_m2_absent] <- A$SATII_M1[sat2_m2_absent]
sat2_m1_higher <- NA_to_F(A$SATII_M1 > A$SATII_M2)
sat2_math_best[sat2_m1_higher] <- A$SATII_M1[sat2_m1_higher]
A$SATII_M <- sat2_math_best
A$SATII_M2_Indic <- !is.na(A$SATII_M2)

# Other SAT II score: the larger of Score_1 and Score_2 (Score_1 is kept when
# the comparison is not available).
sat2_other_best <- A$SATII_Score_1
sat2_second_higher <- NA_to_F(A$SATII_Score_1 < A$SATII_Score_2)
sat2_other_best[sat2_second_higher] <- A$SATII_Score_2[sat2_second_higher]
A$SATII_Other <- sat2_other_best

# High-school GPA capped at 4, and the index built from it: GPA * 1000 plus the
# five test scores, rounded to the nearest 10.
hsgpa_capped <- A$hsgpa
hsgpa_capped[NA_to_F(hsgpa_capped > 4)] <- 4
A$hsgpa_censor <- hsgpa_capped
A$AIS <- round((A$hsgpa_censor * 1000 + A$SATIV + A$SATIM + A$SATII_W + A$SATII_M + A$SATII_Other) / 10) * 10
A$S <- A$AIS
rm(log_parent_income, sat2_math_best, sat2_m2_absent, sat2_m1_higher,
   sat2_other_best, sat2_second_higher, hsgpa_capped)




#Parental Education categories
# Parental education lookup. The row-wise maximum is taken over columns 4 and 7
# of the raw file by position (presumably the two parents' education levels --
# confirm against the file layout).
parent_edu <- read.csv(paste0(secure, "PARENT_EDUCATION_RESIDENCY_D_201712061428.csv"))
parent_edu$educ_lvl_max <- apply(parent_edu[, c(4, 7)], 1, max, na.rm = TRUE)
names(parent_edu)[1] <- "PAR_EDU_RSDNCY_KEY"
parent_edu <- parent_edu[, c(
  "PAR_EDU_RSDNCY_KEY",
  "educ_lvl_max",
  "PAR_EDU_RSDNCY_PAR1_EDU_LVL_UCAP_CD",
  "PAR_EDU_RSDNCY_PAR2_EDU_LVL_UCAP_CD"
)]
# Reduce the key to its digits so it is numeric for the merge.
parent_edu$PAR_EDU_RSDNCY_KEY <- as.numeric(gsub("[^0-9]", "", parent_edu$PAR_EDU_RSDNCY_KEY))

# NOTE(review): merge() without all.x is an inner join, so applicants without a
# matching row are dropped here -- confirm this is intended.
A <- merge(A, parent_edu)
A$Parent_Education_Codes <- factor(paste0(
  A$PAR_EDU_RSDNCY_PAR1_EDU_LVL_UCAP_CD,
  A$PAR_EDU_RSDNCY_PAR2_EDU_LVL_UCAP_CD
))
# Rows where both levels were missing give -Inf (or NA); recode them to 0
# before converting the maximum to a factor.
educ_max_undefined <- is.infinite(A$educ_lvl_max) | is.na(A$educ_lvl_max)
A$educ_lvl_max[educ_max_undefined] <- 0
A$educ_lvl_max <- factor(A$educ_lvl_max)
rm(parent_edu, educ_max_undefined)

#Indicator for applicants who are never determined to be UC-ineligible by any UC campus
# NotIneligible is FALSE only for applicants who were marked "Not Eligible" or
# "Eligible - Special" by at least one of the eight listed campuses AND were not
# marked "Eligible - REGULAR" by any of them; everyone else is TRUE.
eligibility_cols <- paste0(
  "ELGBLITY_CMP_",
  c("BK", "DV", "LA", "RV", "SD", "SC", "SB", "IR"),
  "_DECN_DESC"
)
# Number of campus decision columns whose text matches `pattern`, per applicant.
count_campus_decisions <- function(pattern) {
  Reduce(`+`, lapply(eligibility_cols, function(decision_col) {
    NA_to_F(grepl(pattern, A[[decision_col]]))
  }))
}
any_ineligible_decision <- count_campus_decisions("^Not Eligible|^Eligible - Special") > 0
no_regular_decision <- count_campus_decisions("^Eligible - REGULAR") == 0
A$NotIneligible <- !(any_ineligible_decision & no_regular_decision)
rm(eligibility_cols, count_campus_decisions, any_ineligible_decision, no_regular_decision)

###
#  Create dummy and non-missing variables for OLS regressions
###
# For each score X: X_nom is X with missing values replaced by 0, and X_m flags
# the rows where X was missing.
A$SATIM_nom<-A$SATIM ; A$SATIM_nom[is.na(A$SATIM_nom)]<-0 ; A$SATIM_m<-is.na(A$SATIM)
A$SATIV_nom<-A$SATIV ; A$SATIV_nom[is.na(A$SATIV_nom)]<-0 ; A$SATIV_m<-is.na(A$SATIV)
# NOTE(review): this line reads A$SATIIW, while the AIS index earlier in the
# script reads A$SATII_W -- confirm both columns exist and which one is meant.
A$SATIIW_nom<-A$SATIIW ; A$SATIIW_nom[is.na(A$SATIIW_nom)]<-0 ; A$SATIIW_m<-is.na(A$SATIIW)
A$SATII_M_nom<-A$SATII_M ; A$SATII_M_nom[is.na(A$SATII_M_nom)]<-0 ; A$SATII_M_m<-is.na(A$SATII_M)
A$SATII_Other_nom<-A$SATII_Other ; A$SATII_Other_nom[is.na(A$SATII_Other_nom)]<-0 ; A$SATII_Other_m<-is.na(A$SATII_Other)
# Categorical-style copies: missing values are coded as 2 rather than 0.
A$SATII_M2_Indic_nom<-A$SATII_M2_Indic ; A$SATII_M2_Indic_nom[is.na(A$SATII_M2_Indic_nom)]<-2
A$SATII_1_nom<-A$SATII_1 ; A$SATII_1_nom[is.na(A$SATII_1_nom)]<-2



###
#  Predict ethnicity for non-reporters
###
# Collapse the CETHNICA codes into six groups. Each group lists both the letter
# codes and the numbered labels; codes that appear in no group are left as they
# were (NA when the column is created here).
ethnicity_codes <- list(
  White = c("P", "5. White"),
  Black = c("B", "1. African American"),
  Other = c("A", "K", "M", "2. American Indian", "7. International"),
  Hispanic = c("C", "J", "3. Hispanic/Latino"),
  Asian = c("D", "F", "G", "H", "L", "N", "V", "4. Asian/Pacific Islander"), # "L" is Filipino
  Decline = c("E", "", "6. Unknown")
)
for (ethnicity_label in names(ethnicity_codes)) {
  A$CETHNICA_Cat_HS[A$CETHNICA %in% ethnicity_codes[[ethnicity_label]]] <- ethnicity_label
}
# One indicator column per group used in the prediction step.
for (ethnicity_label in c("White", "Asian", "Black", "Hispanic")) {
  A[, paste0(ethnicity_label, "_HS")] <- A$CETHNICA_Cat_HS == ethnicity_label
}
rm(ethnicity_codes, ethnicity_label)
#Predict ethnicity for "decline" students using LOO characteristics
# For each grouping characteristic n, compute the leave-one-out share of each
# ethnicity among the other applicants who have the same value of n.
for(n in c("FName","MName","LName","CPREVSCH","PERM_ZIP","GEOID10")){
  # Copy the characteristic into a common key column for the aggregate/merge.
  A$Name<-A[,n]
  # For school and ZIP code, only applicants from 1994-2001 contribute to the counts.
  temp<-A ; if(n%in%c("CPREVSCH","PERM_ZIP")) temp<-A[A$YEARAPAY%in%1994:2001,]
  # Count applicants of each ethnicity per key value, excluding "Decline" rows.
  hi<-aggregate(.~Name,temp[temp$CETHNICA_Cat_HS!="Decline",c("Name","White_HS","Asian_HS","Black_HS","Hispanic_HS")],sum)
  names(hi)[-1]<-paste0(names(hi)[-1],"_sum")
  # Attach the counts to every applicant; the join is on the shared column(s),
  # expected to be only Name.
  A<-merge(A,hi,all.x=T)
  for(v in c("White_HS","Asian_HS","Black_HS","Hispanic_HS")){
    # Share = (group count minus own indicator) / (total of the four counts
    # minus 1 if the applicant belongs to one of the four groups).
    A[,paste0(v,"_",n)]<-(A[,paste0(v,"_sum")]-A[,v])/(A[,paste0("White_HS_sum")]+A[,paste0("Asian_HS_sum")]+A[,paste0("Hispanic_HS_sum")]+A[,paste0("Black_HS_sum")]-(A$CETHNICA_Cat_HS%in%c("White","Black","Hispanic","Asian"))) #NOTE: If using for post-2001 prediction, have to only subtract off if 1994:2001 here! Doesn't matter if only using for that period
    # Flag shares that could not be computed, then set them to 0.
    # NOTE(review): an infinite value looks possible when an applicant outside
    # 1994-2001 is subtracted from counts built only from 1994-2001 (see the
    # NOTE on the line above) -- confirm.
    A[,paste0(v,"_",n,"_Empty")]<-is.na(A[,paste0(v,"_",n)])|is.infinite(A[,paste0(v,"_",n)]) #Note: Infinite shouldn't happen; seems like an R error
    A[is.na(A[,paste0(v,"_",n)])|is.infinite(A[,paste0(v,"_",n)]),paste0(v,"_",n)]<-0
  } 
  # Drop the *_sum helper columns before the next characteristic.
  A<-A[,!grepl("_sum$",names(A))]
}
#Choose 10% to hold out for testing
# Uniform draw per applicant; rows with Random < .1 are held out of the fit.
set.seed(1)
A$Random <- runif(nrow(A))
gc()

# Sample used for prediction, and the subset of it used for fitting
# (reported ethnicity in one of the four groups, not held out).
freshman_fm <- A$CQUARTERAP == 2 & A$adm_type == "Freshman" & A$SEX %in% c("F", "M")
fit_rows <- freshman_fm & A$CETHNICA_Cat_HS %in% c("White", "Asian", "Black", "Hispanic") & A$Random >= .1

# Multinomial logit of reported ethnicity on the leave-one-out ethnicity shares
# and the "share unavailable" flags.
reg <- multinom(
  CETHNICA_Cat_HS ~ White_HS_FName+Asian_HS_FName+Hispanic_HS_FName+Black_HS_FName+
    White_HS_MName+Asian_HS_MName+Hispanic_HS_MName+Black_HS_MName+
    White_HS_LName+Asian_HS_LName+Hispanic_HS_LName+Black_HS_LName+
    White_HS_CPREVSCH+Asian_HS_CPREVSCH+Hispanic_HS_CPREVSCH+Black_HS_CPREVSCH+
    White_HS_PERM_ZIP+Asian_HS_PERM_ZIP+Hispanic_HS_PERM_ZIP+Black_HS_PERM_ZIP+
    White_HS_GEOID10+Asian_HS_GEOID10+Hispanic_HS_GEOID10+Black_HS_GEOID10+
    White_HS_FName_Empty+
    White_HS_MName_Empty+
    White_HS_LName_Empty+
    White_HS_CPREVSCH_Empty+
    White_HS_GEOID10_Empty+
    White_HS_PERM_ZIP_Empty,
  data = A[fit_rows, ],
  maxit = 1000
)

# Predicted class probabilities for the whole prediction sample, keyed by
# AL_KEY (stored as the fifth column, after the four probability columns).
pred <- data.frame(predict(reg, A[freshman_fm, ], type = "probs"))
names(pred) <- paste0(names(pred), "_Pred")
pred <- cbind(pred, A[freshman_fm, "AL_KEY"])
names(pred)[5] <- "AL_KEY"
A <- merge(A[, !grepl("_Pred$", names(A))], pred, all.x = TRUE)

# Predicted category: the reported one, except that "Decline" applicants are
# assigned a group when its predicted probability exceeds .75.
A$CETHNICA_Cat_Pred <- A$CETHNICA_Cat_HS
declined_to_state <- A$CETHNICA_Cat_HS == "Decline"
for (pred_group in c("White", "Asian", "Hispanic", "Black")) {
  A$CETHNICA_Cat_Pred[declined_to_state & A[[paste0(pred_group, "_Pred")]] > .75] <- pred_group
}
#Note: Types 1 and 2 error in the paper reported for freshman sample constructed below.
A$EthGen <- paste0(A$CETHNICA_Cat_Pred, "_", A$SEX)
# Drop the leave-one-out share columns now that the prediction is done.
A <- A[, !grepl("_HS_[A-Z]", names(A))]
rm(freshman_fm, fit_rows, declined_to_state, pred_group)


###
#  Calculate high school control function
###
# High-school graduation counts (first column of the file is dropped).
G <- read.csv("Data/Derived/Graduation.csv", stringsAsFactors = FALSE)[, -1]
# Subtract the Filipino column from every Asian column. For the *_UC columns
# kept below, the Filipino counts are added back further down.
for (asian_col in names(G)[grepl("Asian", names(G))]) {
  G[, asian_col] <- G[, asian_col] - G[, gsub("Asian", "Filipino", asian_col)]
}
# Per school, keep only the years from the first year with a positive
# Total_M_UC + Total_F_UC onward.
G <- ddply(G, .(CDS_CODE), function(school_rows) {
  has_uc_total <- (school_rows$Total_M_UC + school_rows$Total_F_UC) > 0
  first_uc_year <- min(school_rows$Year[has_uc_total])
  school_rows[school_rows$Year >= first_uc_year, ]
})
G <- G[G$Total_M_UC + G$Total_F_UC > 0, ] #Lose another 1,500

# Split CDS_CODE into county / district / school. 13-character codes have a
# one-digit county, 14-character codes a two-digit county.
cds_nchar <- nchar(G$CDS_CODE)
cds_13 <- cds_nchar == 13
cds_14 <- cds_nchar == 14
G$HS_County[cds_13] <- as.integer(substr(G$CDS_CODE[cds_13], 1, 1))
G$HS_County[cds_14] <- as.integer(substr(G$CDS_CODE[cds_14], 1, 2))
G$HS_District[cds_13] <- as.integer(substr(G$CDS_CODE[cds_13], 2, 6))
G$HS_District[cds_14] <- as.integer(substr(G$CDS_CODE[cds_14], 3, 7))
G$HS_School[cds_13] <- as.integer(substr(G$CDS_CODE[cds_13], 7, 13))
G$HS_School[cds_14] <- as.integer(substr(G$CDS_CODE[cds_14], 8, 14))
G$YEARAPAY <- G$Year

# Fold Filipino back into Asian for the UC counts; the Other_* residuals are
# computed but not retained by the column filter that follows.
G$Asian_M_UC <- G$Asian_M_UC + G$Filipino_M_UC
G$Asian_F_UC <- G$Asian_F_UC + G$Filipino_F_UC
G$Other_M_UC <- G$Total_M_UC - G$Asian_M_UC - G$Hispanic_M_UC - G$White_M_UC - G$Black_M_UC
G$Other_F_UC <- G$Total_F_UC - G$Asian_F_UC - G$Hispanic_F_UC - G$White_F_UC - G$Black_F_UC
G <- G[, grepl("YEAR|HS_|(White|Asian|Hispanic|Black)_[MF]_UC", names(G))] #|Other

# Long format: one row per school, year and ethnicity-by-gender cell.
G <- melt(G, id.vars = c("HS_County", "HS_District", "HS_School", "YEARAPAY"))
names(G)[5:6] <- c("EthGen", "NumGrads")
G$EthGen <- gsub("_UC", "", G$EthGen)
rm(asian_col, cds_nchar, cds_13, cds_14)
#Merge in number of apps
# Applicants who enter the high-school counts: known HS county, CQUARTERAP == 2,
# SCHTYPE "A", freshman admission type, SEX recorded as F or M.
hs_sample_rows <- !is.na(A$HS_County) & A$CQUARTERAP == 2 & A$SCHTYPE == "A" & A$adm_type == "Freshman" & A$SEX %in% c("F", "M")
a <- A[hs_sample_rows, ]
# AppAll counts not-ineligible applicants; AppSel counts those who also applied
# to campus 01 or 04.
a$AppAll <- a$NotIneligible
a$AppSel <- (a$APP01 == 1 | a$APP04 == 1) & a$NotIneligible
hs_cell_keys <- c("HS_County", "HS_District", "HS_School", "YEARAPAY", "EthGen")
a1 <- aggregate(
  . ~ HS_County + HS_District + HS_School + YEARAPAY + EthGen,
  a[, c(hs_cell_keys, "AppAll", "AppSel")],
  sum
)
# Combine application counts with graduate counts (inner join on shared columns).
a2 <- merge(a1, G)
names(a2)[6:8] <- c("NumApps_HS", "NumSelApps_HS", "NumGrads_HS")
G <- a2
save(G, file = paste0(secure_derived, "Graduation_CleanedAA.Rda"))
A <- merge(A, a2, all.x = TRUE)
rm(hs_sample_rows, hs_cell_keys)


###
#  Merge in institutional value-added statistics
###
# Institution-level value-added measures. The .Rda is expected to provide the
# data frame `FEs` (it is not defined in this script).
load(paste0(secure_derived, "NSC_ValueAdded_Measures.Rda"))
va_by_college <- FEs[
  !is.na(FEs$NSC_EnrFirstIncCC_College_Code),
  grepl("_Code$|FE_[A-Z]", names(FEs))
]
A <- merge(A, va_by_college, all.x = TRUE)

#Note: Right now, using predicted ethnicity for ethnicity-specific VA
# Key of the form "<college code> - <predicted ethnicity>", missing for
# applicants whose predicted category is still "Decline".
college_by_eth <- paste(A$NSC_EnrFirstIncCC_College_Code, "-", A$CETHNICA_Cat_Pred)
college_by_eth[A$CETHNICA_Cat_Pred %in% c("Decline")] <- NA
A$NSC_EnrFirstIncCC_College_Code_Eth <- college_by_eth

# Rename the FE_* columns with an _Eth suffix so the second merge adds the
# ethnicity-specific measures alongside the pooled ones.
fe_name_mask <- grepl("^FE_", names(FEs))
names(FEs)[fe_name_mask] <- paste0(names(FEs)[fe_name_mask], "_Eth")
va_by_college_eth <- FEs[
  !is.na(FEs$NSC_EnrFirstIncCC_College_Code_Eth),
  grepl("_Eth$", names(FEs))
]
A <- merge(A, va_by_college_eth, all.x = TRUE)

# Put the graduation value-added measures on a 0-100 scale.
for (grad_va_col in names(A)[grepl("^FE_.*GradVA", names(A))]) {
  A[, grad_va_col] <- A[, grad_va_col] * 100
}
rm(va_by_college, college_by_eth, fe_name_mask, va_by_college_eth, grad_va_col)

###
#  Calculate applicant HS rank
###
# Rank of each distinct GPA value within high school and application year:
# rank 1 is the highest GPA observed for that school-year among applicants.
gpa_rank_keys <- c("hsgpa", "YEARAPAY", "CPREVSCH")
gpa_ranks <- A[!duplicated(A[, gpa_rank_keys]) & !is.na(A$hsgpa), gpa_rank_keys]
gpa_ranks <- gpa_ranks[order(gpa_ranks$CPREVSCH, gpa_ranks$YEARAPAY, gpa_ranks$hsgpa, decreasing = TRUE), ]
gpa_ranks <- ddply(gpa_ranks, .(CPREVSCH, YEARAPAY), function(school_year) {
  # Rows arrive sorted by descending GPA, so position equals rank.
  school_year$hsgpa_rank <- seq_len(nrow(school_year))
  school_year
})
A <- merge(A, gpa_ranks, all.x = TRUE)
rm(gpa_rank_keys, gpa_ranks)


###
#  Replace AL_KEY with random ID
###
# Replace AL_KEY with a random ID.
# One row per distinct AL_KEY; the ID column is only a placeholder that is
# overwritten with the new identifier.
id_map <- A[order(A$AL_KEY), c("AL_KEY", "ID")]
id_map <- id_map[!duplicated(id_map$AL_KEY), ]
# Assign a random permutation of 1..n. Numbering the keys 1..n in sorted order
# (as was done before) keeps the new ID monotone in the original key, so it is
# not random and the original ordering can be read off the released ID.
# Reproducibility follows the script's RNG state (seeded earlier via set.seed).
id_map$ID <- sample.int(nrow(id_map))
names(id_map)[2] <- "New_ID"
# Swap the key in A and drop the helper column.
A <- merge(A, id_map)
A$AL_KEY <- A$New_ID
A <- A[, !names(A) == "New_ID"]
rm(id_map)

###
#  Create course dataset using UC-CHP student database
###
if(Clean_Course_Data){
  # Link student records to applicant records in four passes of decreasing
  # strictness, removing matched records from both pools after each pass.
  #   s: student records; the callers pass FName, LName, BMonth, BDay, BYear, SID.
  #   a: applicant records; the callers pass AL_KEY, the same name and birth-date
  #      columns, plus URM and YEARAPAY.
  # Returns the matched rows without the name and birth-date columns.
  # allduplicated() is a project helper -- presumably it flags every copy of a
  # duplicated value, not just the later ones (TODO confirm).
  mergeToData<-function(s,a){ #Applicant - Student link by name and birth date
    # Pass 1: exact match on all shared columns (both names and full birth date).
    hi<-merge(a,s)
    # Remove everything matched in pass 1 from both pools, then discard
    # applicants that matched more than once.
    a<-a[!a$AL_KEY%in%hi$AL_KEY,] ; s<-s[!s$SID%in%hi$SID,] ; hi<-hi[!allduplicated(hi$AL_KEY),]
    # Pass 2: hide the student's first name so the join is on last name + birth date.
    s<-rename(s,FName_S=FName) ; hi<-rbind.fill(hi,merge(a,s))
    hi<-hi[!allduplicated(hi$AL_KEY),] ; a<-a[!a$AL_KEY%in%hi$AL_KEY,] ; s<-s[!s$SID%in%hi$SID,] #Don't throw out duplicates if they can be uniquely matched below.
    # Pass 3: restore first name, hide last name: join on first name + birth date.
    s<-rename(s,LName_S=LName,FName=FName_S) ; hi<-rbind.fill(hi,merge(a,s))
    hi<-hi[!allduplicated(hi$AL_KEY),] ; a<-a[!a$AL_KEY%in%hi$AL_KEY,] ; s<-s[!s$SID%in%hi$SID,]
    # Pass 4: restore last name, hide the birth-date fields: join on both names only.
    s<-rename(s,BDay_S=BDay,BMonth_S=BMonth,BYear_S=BYear,LName=LName_S) ; hi<-rbind.fill(hi,merge(a,s))
    hi<-hi[!allduplicated(hi$AL_KEY),] ; a<-a[!a$AL_KEY%in%hi$AL_KEY,] ; s<-s[!s$SID%in%hi$SID,]
    # Strip every name and birth-date column (including the *_S copies).
    return(hi[,!grepl("Name|BMonth|BDay|BYear",names(hi))])
  }
  
  # Accumulator for the per-campus course tables.
  CC <- data.frame()

  ## Berkeley

  # The two .Rda files are expected to provide `S` (students) and `C` (courses).
  load(paste0(secure_ucb, "Digital_Student_Data.Rda"))
  load(paste0(secure_ucb, "Digital_Course_Data.Rda"))
  # Undergraduate student records (first row per SID) and the applicants who
  # originally enrolled at campus 01 before 2003.
  ucb_students <- S[S$UG, c("FName", "LName", "BMonth", "BDay", "BYear", "SID")][!duplicated(S$SID[S$UG]), ]
  ucb_applicants <- A[A$ENR01_Orig == 1 & A$YEARAPAY < 2003, c("AL_KEY", "FName", "LName", "BMonth", "BDay", "BYear", "URM", "YEARAPAY")]
  hi <- mergeToData(ucb_students, ucb_applicants) #Works well; missing less than 1%
  C <- merge(C, hi)
  C <- C[C$UG, ]
  C$Course <- paste(C$Department, C$Course_Number)
  # Grade points: base value from the letter found in the grade, then +/- 0.3
  # for a plus or minus sign. Letters are applied in A..F order, so a later
  # letter overrides an earlier one when a grade contains both.
  ucb_letter_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  for (ucb_letter in names(ucb_letter_points)) {
    C$GPA[grepl(ucb_letter, C$Grade)] <- ucb_letter_points[[ucb_letter]]
  }
  ucb_plus <- grepl("[+]", C$Grade)
  C$GPA[ucb_plus] <- C$GPA[ucb_plus] + 0.3
  ucb_minus <- grepl("[-]", C$Grade)
  C$GPA[ucb_minus] <- C$GPA[ucb_minus] - 0.3
  # Passing grades start with A, B, C, D, S or P.
  C$Pass <- NA_to_F(grepl("^[ABCDSP]", C$Grade))
  C <- C[, c("Year", "Term", "Course", "GPA", "Pass", "SID", "AL_KEY", "URM", "YEARAPAY")]
  C$Campus_ID <- 1
  CC <- rbind(CC, C)
  rm(ucb_students, ucb_applicants, ucb_letter_points, ucb_letter, ucb_plus, ucb_minus)
  
  ##Davis
  
  # The .Rda is expected to provide `S` (students) and `C` (courses).
  load(paste0(secure_ucd, "UCD_Digital_Data.Rda"))
  # Parse the birth date from fixed positions: day (1-2), month abbreviation
  # (4-6), two-digit year (8-9). Two-digit years below 5 are read as 20xx, all
  # others as 19xx.
  S$BDay <- as.integer(substr(S$BDate, 1, 2))
  S$BMonth <- dplyr::recode(substr(S$BDate, 4, 6), JAN=1,FEB=2,MAR=3,APR=4,MAY=5,JUN=6,JUL=7,AUG=8,SEP=9,OCT=10,NOV=11,DEC=12)
  ucd_two_digit_year <- as.integer(substr(S$BDate, 8, 9))
  S$BYear <- ucd_two_digit_year + 1900 + 100 * (ucd_two_digit_year < 5)
  # Student records (first row per SID) and the applicants who originally
  # enrolled at campus 03 before 2003.
  ucd_students <- S[, c("FName", "LName", "BMonth", "BDay", "BYear", "SID")][!duplicated(S$SID), ]
  ucd_applicants <- A[A$ENR03_Orig == 1 & A$YEARAPAY < 2003, c("AL_KEY", "FName", "LName", "BMonth", "BDay", "BYear", "URM", "YEARAPAY")]
  hi <- mergeToData(ucd_students, ucd_applicants) #Again missing less than 1%
  C <- merge(C, hi)
  C$Course <- paste(C$Department, C$Course_Number)
  C$Year <- C$Year_Cal
  # Grade points: base value from the letter found in the grade (applied in
  # A..F order), then +/- 0.3 for a plus or minus sign.
  ucd_letter_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  for (ucd_letter in names(ucd_letter_points)) {
    C$GPA[grepl(ucd_letter, C$Grade)] <- ucd_letter_points[[ucd_letter]]
  }
  ucd_plus <- grepl("[+]", C$Grade)
  C$GPA[ucd_plus] <- C$GPA[ucd_plus] + 0.3
  ucd_minus <- grepl("[-]", C$Grade)
  C$GPA[ucd_minus] <- C$GPA[ucd_minus] - 0.3
  # Passing grades start with A, B, C, D, S or P.
  C$Pass <- NA_to_F(grepl("^[ABCDSP]", C$Grade))
  C <- C[, c("Year", "Term", "Course", "GPA", "Pass", "SID", "AL_KEY", "URM", "YEARAPAY")]
  C$Campus_ID <- 3
  CC <- rbind(CC, C)
  rm(ucd_two_digit_year, ucd_students, ucd_applicants, ucd_letter_points, ucd_letter, ucd_plus, ucd_minus)
  
  ##Riverside
  
  # The .Rda is expected to provide `S` (students) and `C` (courses).
  load(paste0(secure_ucr, "UCR_Digital_Data.Rda"))
  # Parse the birth date from fixed positions: year (1-4), month (6-7), day (9-10).
  S$BDay <- as.integer(substr(S$BDate, 9, 10))
  S$BMonth <- as.integer(substr(S$BDate, 6, 7))
  S$BYear <- as.integer(substr(S$BDate, 1, 4))
  # Put names in title case before matching (titleCase is a project helper).
  S$LName <- titleCase(S$LName)
  S$FName <- titleCase(S$FName)
  # Student records (first row per SID) and the applicants who originally
  # enrolled at campus 05 before 2003.
  ucr_students <- S[, c("FName", "LName", "BMonth", "BDay", "BYear", "SID")][!duplicated(S$SID), ]
  ucr_applicants <- A[A$ENR05_Orig == 1 & A$YEARAPAY < 2003, c("AL_KEY", "FName", "LName", "BMonth", "BDay", "BYear", "URM", "YEARAPAY")]
  hi <- mergeToData(ucr_students, ucr_applicants) #1 percent
  C <- merge(C, hi)
  C$Course <- paste(C$Department, C$Course_Number)
  # Grade points: base value from the letter found in the grade (applied in
  # A..F order), then +/- 0.3 for a plus or minus sign.
  ucr_letter_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  for (ucr_letter in names(ucr_letter_points)) {
    C$GPA[grepl(ucr_letter, C$Grade)] <- ucr_letter_points[[ucr_letter]]
  }
  ucr_plus <- grepl("[+]", C$Grade)
  C$GPA[ucr_plus] <- C$GPA[ucr_plus] + 0.3
  ucr_minus <- grepl("[-]", C$Grade)
  C$GPA[ucr_minus] <- C$GPA[ucr_minus] - 0.3
  C$Year <- C$Year_Cal
  # Passing grades start with A, B, C, D, S or P.
  C$Pass <- NA_to_F(grepl("^[ABCDSP]", C$Grade))
  C <- C[, c("Year", "Term", "Course", "GPA", "Pass", "SID", "AL_KEY", "URM", "YEARAPAY")]
  C$Campus_ID <- 5
  CC <- rbind(CC, C)
  rm(ucr_students, ucr_applicants, ucr_letter_points, ucr_letter, ucr_plus, ucr_minus)
  
  ##Santa Cruz
  
  # The .Rda is expected to provide `S` (students) and `C` (courses).
  load(paste0(secure_ucsc, "UnoffTrans_Data_Cleaned.Rda"))
  # The student identifier is the SSN column with dashes removed.
  # NOTE(review): SID is retained in the saved course dataset, so for this
  # campus that file carries this SSN-derived value -- confirm this is acceptable.
  S$SID <- gsub("-", "", S$SSN)
  # Student records (first row per SID) and the applicants who originally
  # enrolled at campus 07 before 2003.
  ucsc_students <- S[, c("FName", "LName", "BMonth", "BDay", "BYear", "SID")][!duplicated(S$SID), ]
  ucsc_applicants <- A[A$ENR07_Orig == 1 & A$YEARAPAY < 2003, c("AL_KEY", "FName", "LName", "BMonth", "BDay", "BYear", "URM", "YEARAPAY")]
  hi <- mergeToData(ucsc_students, ucsc_applicants) #Near-perfect
  C$SID <- gsub("-", "", C$SSN)
  C <- merge(C, hi)
  C$Term <- dplyr::recode(C$Term, FALL="Fall",SPRING="Spring",WINTER="Winter")
  C$Course <- paste(C$Department, C$Course_Number)
  # Grade points: base value from the letter found in the grade (applied in
  # A..F order), then +/- 0.3 for a plus or minus sign.
  ucsc_letter_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  for (ucsc_letter in names(ucsc_letter_points)) {
    C$GPA[grepl(ucsc_letter, C$Grade)] <- ucsc_letter_points[[ucsc_letter]]
  }
  ucsc_plus <- grepl("[+]", C$Grade)
  C$GPA[ucsc_plus] <- C$GPA[ucsc_plus] + 0.3
  ucsc_minus <- grepl("[-]", C$Grade)
  C$GPA[ucsc_minus] <- C$GPA[ucsc_minus] - 0.3
  # Passing grades start with A, B, C, D, S or P.
  C$Pass <- NA_to_F(grepl("^[ABCDSP]", C$Grade))
  C <- C[, c("Year", "Term", "Course", "GPA", "Pass", "SID", "AL_KEY", "URM", "YEARAPAY")]
  C$Campus_ID <- 7
  CC <- rbind(CC, C)
  rm(ucsc_students, ucsc_applicants, ucsc_letter_points, ucsc_letter, ucsc_plus, ucsc_minus)
  
  ##Santa Barbara
  
  # The .Rda is expected to provide `S` (students) and `C` (courses).
  load(paste0(secure_ucsb, "UCSB_Digital_Data.Rda"))
  # Student records (first row per SID) and the applicants who originally
  # enrolled at campus 08 before 2003.
  ucsb_students <- S[, c("FName", "LName", "BMonth", "BDay", "BYear", "SID")][!duplicated(S$SID), ]
  ucsb_applicants <- A[A$ENR08_Orig == 1 & A$YEARAPAY < 2003, c("AL_KEY", "FName", "LName", "BMonth", "BDay", "BYear", "URM", "YEARAPAY")]
  hi <- mergeToData(ucsb_students, ucsb_applicants) #Less than 1%
  C <- merge(C, hi)
  C$Course <- paste(C$Department, C$Course_Number)
  # Grade points: base value from the letter found in the grade (applied in
  # A..F order), then +/- 0.3 for a plus or minus sign.
  ucsb_letter_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  for (ucsb_letter in names(ucsb_letter_points)) {
    C$GPA[grepl(ucsb_letter, C$Grade)] <- ucsb_letter_points[[ucsb_letter]]
  }
  ucsb_plus <- grepl("[+]", C$Grade)
  C$GPA[ucsb_plus] <- C$GPA[ucsb_plus] + 0.3
  ucsb_minus <- grepl("[-]", C$Grade)
  C$GPA[ucsb_minus] <- C$GPA[ucsb_minus] - 0.3
  # Passing grades start with A, B, C, D, S or P.
  C$Pass <- NA_to_F(grepl("^[ABCDSP]", C$Grade))
  C <- C[, c("Year", "Term", "Course", "GPA", "Pass", "SID", "AL_KEY", "URM", "YEARAPAY")]
  C$Campus_ID <- 8
  CC <- rbind(CC, C)
  rm(ucsb_students, ucsb_applicants, ucsb_letter_points, ucsb_letter, ucsb_plus, ucsb_minus)
  
  # Hand the stacked table over as `C` (the name stored in the .Rda) and prefix
  # every course label with its campus ID, so identical labels at different
  # campuses stay distinct.
  C <- CC
  rm(CC)
  C$Course <- paste(C$Campus_ID, C$Course)

  save(C, file = paste0(secure_derived, "AffAct_Data_Courses.Rda"))
}

# Drop the directly identifying columns before saving. Only the columns listed
# here are removed; anything else (e.g. BYear) stays in the saved data.
identifying_cols <- c(
  "LKEY", "Name", "FName", "MInit", "MName", "LName",
  "BMonth", "BDay", "PERM_STREET", "ID", "ELC_KEY"
)
A <- A[, !names(A) %in% identifying_cols] #Anonymize data
rm(identifying_cols)

save(A, file = paste0(secure_derived, "AffAct_Data.Rda"))